Working idea: This network will then be used to identify hub genes for each of the early, middle and late interactions.
Load Phytophthora cinnamomi gtf file, clean up to get Gene IDs
Combine alternatively spliced transcripts (seen in transcript_id as -RA and RB)
suppressPackageStartupMessages({c(
library(WGCNA),
library(DESeq2),
library(GEOquery),
library(tidyverse),
library(gridExtra),
library(reshape2),
library(rtracklayer),
library(dplyr),
library(edgeR))})
[1] "WGCNA" "fastcluster" "dynamicTreeCut" "stats" "graphics"
[6] "grDevices" "utils" "datasets" "methods" "base"
[11] "DESeq2" "SummarizedExperiment" "Biobase" "MatrixGenerics" "matrixStats"
[16] "GenomicRanges" "GenomeInfoDb" "IRanges" "S4Vectors" "BiocGenerics"
[21] "stats4" "WGCNA" "fastcluster" "dynamicTreeCut" "stats"
[26] "graphics" "grDevices" "utils" "datasets" "methods"
[31] "base" "GEOquery" "DESeq2" "SummarizedExperiment" "Biobase"
[36] "MatrixGenerics" "matrixStats" "GenomicRanges" "GenomeInfoDb" "IRanges"
[41] "S4Vectors" "BiocGenerics" "stats4" "WGCNA" "fastcluster"
[46] "dynamicTreeCut" "stats" "graphics" "grDevices" "utils"
[51] "datasets" "methods" "base" "lubridate" "forcats"
[56] "stringr" "dplyr" "purrr" "readr" "tidyr"
[61] "tibble" "ggplot2" "tidyverse" "GEOquery" "DESeq2"
[66] "SummarizedExperiment" "Biobase" "MatrixGenerics" "matrixStats" "GenomicRanges"
[71] "GenomeInfoDb" "IRanges" "S4Vectors" "BiocGenerics" "stats4"
[76] "WGCNA" "fastcluster" "dynamicTreeCut" "stats" "graphics"
[81] "grDevices" "utils" "datasets" "methods" "base"
[86] "gridExtra" "lubridate" "forcats" "stringr" "dplyr"
[91] "purrr" "readr" "tidyr" "tibble" "ggplot2"
[96] "tidyverse" "GEOquery" "DESeq2" "SummarizedExperiment" "Biobase"
[101] "MatrixGenerics" "matrixStats" "GenomicRanges" "GenomeInfoDb" "IRanges"
[106] "S4Vectors" "BiocGenerics" "stats4" "WGCNA" "fastcluster"
[111] "dynamicTreeCut" "stats" "graphics" "grDevices" "utils"
[116] "datasets" "methods" "base" "reshape2" "gridExtra"
[121] "lubridate" "forcats" "stringr" "dplyr" "purrr"
[126] "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
[131] "GEOquery" "DESeq2" "SummarizedExperiment" "Biobase" "MatrixGenerics"
[136] "matrixStats" "GenomicRanges" "GenomeInfoDb" "IRanges" "S4Vectors"
[141] "BiocGenerics" "stats4" "WGCNA" "fastcluster" "dynamicTreeCut"
[146] "stats" "graphics" "grDevices" "utils" "datasets"
[151] "methods" "base" "rtracklayer" "reshape2" "gridExtra"
[156] "lubridate" "forcats" "stringr" "dplyr" "purrr"
[161] "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
[166] "GEOquery" "DESeq2" "SummarizedExperiment" "Biobase" "MatrixGenerics"
[171] "matrixStats" "GenomicRanges" "GenomeInfoDb" "IRanges" "S4Vectors"
[176] "BiocGenerics" "stats4" "WGCNA" "fastcluster" "dynamicTreeCut"
[181] "stats" "graphics" "grDevices" "utils" "datasets"
[186] "methods" "base" "rtracklayer" "reshape2" "gridExtra"
[191] "lubridate" "forcats" "stringr" "dplyr" "purrr"
[196] "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
[201] "GEOquery" "DESeq2" "SummarizedExperiment" "Biobase" "MatrixGenerics"
[206] "matrixStats" "GenomicRanges" "GenomeInfoDb" "IRanges" "S4Vectors"
[211] "BiocGenerics" "stats4" "WGCNA" "fastcluster" "dynamicTreeCut"
[216] "stats" "graphics" "grDevices" "utils" "datasets"
[221] "methods" "base" "edgeR" "limma" "rtracklayer"
[226] "reshape2" "gridExtra" "lubridate" "forcats" "stringr"
[231] "dplyr" "purrr" "readr" "tidyr" "tibble"
[236] "ggplot2" "tidyverse" "GEOquery" "DESeq2" "SummarizedExperiment"
[241] "Biobase" "MatrixGenerics" "matrixStats" "GenomicRanges" "GenomeInfoDb"
[246] "IRanges" "S4Vectors" "BiocGenerics" "stats4" "WGCNA"
[251] "fastcluster" "dynamicTreeCut" "stats" "graphics" "grDevices"
[256] "utils" "datasets" "methods" "base"
Load GO terms blasted from Omicsbox
pc.gtf <- rtracklayer::import("https://ftp.ncbi.nlm.nih.gov/genomes/genbank/protozoa/Phytophthora_cinnamomi/latest_assembly_versions/GCA_018691715.1_ASM1869171v1/GCA_018691715.1_ASM1869171v1_genomic.gtf.gz")
trying URL 'https://ftp.ncbi.nlm.nih.gov/genomes/genbank/protozoa/Phytophthora_cinnamomi/latest_assembly_versions/GCA_018691715.1_ASM1869171v1/GCA_018691715.1_ASM1869171v1_genomic.gtf.gz'
Content type 'application/x-gzip' length 2722947 bytes (2.6 MB)
==================================================
downloaded 2.6 MB
pc.df <- as.data.frame((pc.gtf))
pc.genes <- filter(pc.df, type == "start_codon")
pc.geneids <- select(pc.genes, gene_id, product, protein_id)
pc.annotations <- pc.geneids %>%
distinct(gene_id, .keep_all = T)
nrow(pc.annotations)
[1] 19969
pcannot <- as.matrix(pc.annotations)
#write.table(pcannot, file = "pcgeneids.tsv",quote=FALSE,sep="\t")
Make GO term list for enrichment analysis
goterms <- read.delim('pc.goterms.txt', header = TRUE) %>%
select(c(SeqName, GO.IDs, GO.Names, Description))
head(goterms)
colnames(goterms) <- c("protein_id", "GO_terms", "GO_descripton", "description")
goterms.df <- dplyr::right_join(goterms, pc.annotations, by = 'protein_id')
nrow(goterms.df)
[1] 19969
Assemble count matrix and coldata file
Warning: `as.tibble()` was deprecated in tibble 2.0.0.
Please use `as_tibble()` instead.
The signature and semantics have changed, see `?as_tibble`.
[1] "F:GO:0005515" "P:GO:0015074" "F:GO:0003676" "F:GO:0008270" "P:GO:0006355" "F:GO:0003700" "F:GO:0005509" "P:GO:0036297"
[9] "C:GO:0043240" "P:GO:0006629" "F:GO:0020037" "P:GO:0005975" "P:GO:0006508" "F:GO:0030248" "C:GO:0005576" "P:GO:0006801"
[17] "F:GO:0046872" "F:GO:0005525" "P:GO:0007076" "C:GO:0016020"
[1] "F:GO:0005515 F:protein binding" "P:GO:0015074 P:DNA integration"
[3] "F:GO:0003676 F:nucleic acid binding" "F:GO:0008270 F:zinc ion binding"
[5] "P:GO:0006355 P:regulation of DNA-templated transcription" "F:GO:0003700 F:DNA-binding transcription factor activity"
Import files into r
tmp<-read.table("3col.tsv.gz",header=F)
x<-as.matrix(acast(tmp, V2~V1, value.var="V3"))
colnames(x)<-sapply(strsplit(colnames(x),"_"),"[[",1)
#head(x)
#write.table(x,file="countmatrix.tsv",quote=FALSE,sep="\t")
Clean countmatrix for P. cinnamomi analysis
coldata <- read.table('sample_info.tsv')
Pcgenenames <- read.table('pcgeneids.tsv', row.names = 1, quote = "", sep='\t', fill = TRUE, header = TRUE)
IUM83Tanjilcountmatrix <- read.table('countmatrix.tsv')
Clean coldata for Deseq2 normalisation
collabels <- colnames(IUM83Tanjilcountmatrix) <- (coldata$Sample)
colnames(IUM83Tanjilcountmatrix) <- collabels
IUM83Tanjilcountmatrix <- tibble::rownames_to_column(IUM83Tanjilcountmatrix, "gene_id")
IUM83CountMatrix <- IUM83Tanjilcountmatrix %>%
filter(str_detect(gene_id, "IUM83"))
rownames(IUM83CountMatrix) <- IUM83CountMatrix$gene_id
IUM83CountMatrix <- IUM83CountMatrix [ ,-1]
IUM83CountMatrix <- as.data.frame(IUM83CountMatrix)
IUM83CountMatrix <- IUM83CountMatrix[ ,-(1:12)]
#write.table(IUM83CountMatrix, file = "Pc_countmatrix.tsv",quote=FALSE,sep="\t")
Detect and remove outlier genes
colData <- coldata %>%
filter(str_detect(Sample, "Pc"))
rownames(colData) <- colData$Sample
colData <- colData [ ,-1]
Detect outlier samples
#Call a function from WGCNA package that detects outliers
#Rows should be the samples and the columns genes
gsg <- goodSamplesGenes(t(IUM83CountMatrix))
Flagging genes and samples with too many missing values...
..step 1
..step 2
summary(gsg)
Length Class Mode
goodGenes 19981 -none- logical
goodSamples 12 -none- logical
allOK 1 -none- logical
#If false, then there are outliers
gsg$allOK #False
[1] FALSE
table(gsg$goodGenes) #3012to be excluded
FALSE TRUE
3012 16969
table(gsg$goodSamples) #All 12 samples passed
TRUE
12
# remove genes that are detectd as outliers
data <- IUM83CountMatrix[gsg$goodGenes == TRUE,]
The WGCNA library requires normalisation using the vst (variance-stabilising transform) function from the DESeq2 package
# detect outlier samples - hierarchical clustering
htree <- hclust(dist(t(data)), method = "average")
groups <- cutree(htree, k=2) # cut tree into clusters
plot(htree, labels(groups))
# draw dendogram with red borders around the clusters
rect.hclust(htree, k=2, border="red")
# exclude outlier samples from the column data to match the new data
#colData <- colData %>%
# filter(!row.names(.) %in% outliers)
# making sure the rownames and column names identical for the two data sets
#all(rownames(colData) == colnames(clusters))
# create DESeq Data Set
dds <- DESeqDataSetFromMatrix(countData = data,
colData = colData,
design = ~ 1) # no model, for normalisation only
# remove all genes with counts < 10 in more than 90% of samples (12*0.9=10.8 ~ 11) as suggested by WGCNA on RNAseq FAQ (https://horvath.genetics.ucla.edu/html/CoexpressionNetwork/Rpackages/WGCNA/faq.html)
# <10 in more than 90% samples
dds90 <- dds[rowSums(counts(dds) >= 10) >= 11,]
nrow(dds90) # 5403 genes
[1] 5403
# >= 10) >= 11,] 5403 genes <===== #less than 10 counts in %90 of samples
# >= 15) >= 11,] 4420 genes
gene.list <- row.names(dds90)
#lapply(gene.list, write, "all_genes.txt", append=TRUE)
# perform variance stabilization
dds_norm <- vst(dds90)
-- note: fitType='parametric', but the dispersion trend was not well captured by the
function: y = a/x + b, and a local regression fit was automatically substituted.
specify fitType='local' or 'mean' to avoid this message next time.
# get normalized counts
norm.counts <- assay(dds_norm) %>%
t()
your_dds <- estimateSizeFactors(dds90)
your_dds <- estimateDispersions(your_dds)
gene-wise dispersion estimates
mean-dispersion relationship
-- note: fitType='parametric', but the dispersion trend was not well captured by the
function: y = a/x + b, and a local regression fit was automatically substituted.
specify fitType='local' or 'mean' to avoid this message next time.
final dispersion estimates
# Plot dispersion estimates and fits
plotDispEsts(your_dds, main = "Dispersion Trend with Local and Parametric Fits")
#Save normalised dataset for faster analysis next time
#saveRDS(norm.counts, "norm.counts.rds")
Create network and identify modules
Note: this chunk is set not to run since the blockwise Modules function takes a long time to run. The R object has been saved and is loaded in the next chunk for a faster run time.
# convert matrix to numeric
norm.counts[] <- sapply(norm.counts, as.numeric)
#Successively, hierarchical clustering was performed to identify modules
temp_cor <- cor
cor <- WGCNA::cor
# Parameters to be tweaked later
bwnet <- blockwiseModules(norm.counts,
maxBlockSize = 7500, # selected total number of genes since CPU memory is sufficient (32GB workstation should be able to handle perhaps 30000)
TOMType = "signed",
power = soft_power,
mergeCutHeight = 0.10,
saveTOMs = TRUE,
saveTOMFileBase = "pc_TOM",
numericLabels = FALSE, # set as false to assign color as labels
#randomSeed = 1243, # for reproducibility since this function uses clustering
verbose = 3)
Calculating module eigengenes block-wise from all genes
Flagging genes and samples with too many missing values...
..step 1
..Working on block 1 .
TOM calculation: adjacency..
..will not use multithreading.
Fraction of slow calculations: 0.000000
..connectivity..
..matrix multiplication (system BLAS)..
..normalization..
..done.
..saving TOM for block 1 into file pc_TOM-block.1.RData
....clustering..
....detecting modules..
....calculating module eigengenes..
....checking kME in modules..
..reassigning 1 genes from module 2 to modules with higher KME.
..reassigning 1 genes from module 3 to modules with higher KME.
..reassigning 4 genes from module 5 to modules with higher KME.
..merging modules that are too close..
mergeCloseModules: Merging modules whose distance is less than 0.1
Calculating new MEs...
# Save bwnet object
saveRDS(bwnet, "bwnet.rds")
cor <- temp_cor
# load bwnet object
bwnet <- readRDS("bwnet.rds")
module_eigengenes <- bwnet$MEs %>%
orderMEs(.)
head(module_eigengenes)
# Plot the dendrogram and the module colors before and after merging underneath
plotDendroAndColors(bwnet$dendrograms[[1]], cbind(bwnet$unmergedColors, bwnet$colors),
c("unmerged", "merged"),
dendroLabels = FALSE,
addGuide = TRUE,
hang= 0.03,
guideHang = 0.05)
# grey module = all genes that doesn't fall into other modules
module.total <- table(bwnet$colors)
# Define numbers of genes and samples
nSamples <- nrow(norm.counts)
nGenes <- ncol(norm.counts)
# correlation for module eigengenes and traits
module.trait.corr <- cor(module_eigengenes, traits, use = 'p')
module.trait.corr.pvals <- corPvalueStudent(module.trait.corr, nSamples)
# Heat map v2 (from WGCNA tutorial)
textMatrix = paste(signif(module.trait.corr, 2), "\n(",
signif(module.trait.corr.pvals, 1), ")", sep = "")
dim(textMatrix) = dim(module.trait.corr)
par(mar = c(6, 8.5, 3, 3));
labeledHeatmap(Matrix = module.trait.corr,
xLabels = colnames(module.trait.corr),
yLabels = rownames(module.trait.corr),
ySymbols = rownames(module.trait.corr),
colorLabels = FALSE,
colors = blueWhiteRed(50),
textMatrix = textMatrix,
textAdj = c(0.5, 0.5),
setStdMargins = FALSE,
cex.lab.y = 0.6,
cex.text = 0.7,
zlim = c(-1,1),
main = paste("Module-trait relationships"))
# get number of genes for each module
table(bwnet$colors)
black blue brown cyan green greenyellow grey lightcyan magenta
243 881 524 57 262 95 428 40 132
midnightblue pink purple red salmon tan turquoise yellow
47 201 98 249 65 85 1681 315
#Tag genes with module membership and store it in a table
module.gene.mapping <- as.data.frame(bwnet$colors)
module.gene.mapping
NA
Significance in brackets shows how significantly associated the
module (cluster of genes) is the trait of interest
Find modules that have significant association with disease state
Calculate the module membership and the associated p-values The module membership/intramodular connectivity is calculated as the correlation of the eigengene and the gene expression profile. This quantifies the similarity of all genes on the array to every module.
Using the gene significance you can identify genes that have a high significance for trait of interest Using the module membership measures you can identify genes with high module membership in interesting modules.
#create traits file - assign 1 if a sample is a certain stage, else assign 0
Dis_traits <- colData %>%
mutate(Dis.vs.all = ifelse(grepl('Treated', Treatment), 1, 0)) %>%
select(5)
factor_levels <- unique(colData$Stage)
# transform stages into factors and define levels
colData$Stage <- factor(colData$Stage, levels = factor_levels)
traits <- binarizeCategoricalColumns(colData$Stage,
#dropFirstLevelVsAll = FALSE,
includePairwise = FALSE,
includeLevelVsAll = TRUE,
minCount = 1)
traits <- cbind(Dis_traits, traits)
# Define a gene significance variable for Early
GS.Early <- as.numeric(cor(norm.counts, traits$data.Early.vs.all, use = "p"))
# This translates the numeric values into colors
GS.stage_earlyColor = numbers2colors(GS.Early, signed = T)
blocknumber = 1
moduleLabelsAutomatic = bwnet$colors
# Convert labels to colors for plotting
moduleColorsAutomatic = labels2colors(moduleLabelsAutomatic)
datColors = data.frame(bwnet$colors, GS.stage_earlyColor)[bwnet$blockGenes[[blocknumber]],
]
# Plot the dendrogram and the module colors underneath
plotDendroAndColors(bwnet$dendrograms[[blocknumber]], colors = datColors, groupLabels = c("Module colors",
"GS.stage.Early"), dendroLabels = FALSE, hang = 0.03, addGuide = TRUE, guideHang = 0.05)
earlygs <- early.gene.signf.corr %>%
as.data.frame() %>%
tibble::rownames_to_column("gene_id")
earlygsid <- dplyr::right_join(pc.annotations, earlygs, by = 'gene_id')
datKme.id <- as.data.frame(datKME) %>%
rownames_to_column("gene_id")
earlyeiginengene <- dplyr::right_join(earlygsid, datKme.id, by = 'gene_id')
datKME <- signedKME(norm.counts, module_eigengenes)
# Measure the correlation of the gene's module membership and the associated p-values
module.membership.measure <- cor(module_eigengenes, norm.counts, use = 'p')
module.membership.measure.pvals <- corPvalueStudent(module.membership.measure, nSamples)
# Calculate the gene significance and associated p-values
early.gene.signf.corr <- cor(norm.counts, traits$data.Early.vs.all, use = 'p')
early.gene.signf.corr.pvals <- corPvalueStudent(early.gene.signf.corr, nSamples)
colorOfColumn = substring(names(datKME), 4)
#selectModules = c("brown", "magenta", "blue", "turquoise", "lightcyan")
selectModules = colorOfColumn[colorOfColumn !="grey"]
#par(mfrow = c(4, length(selectModules)/4))
for (module in selectModules) {
column = match(module, colorOfColumn)
restModule = moduleColorsAutomatic == module
verboseScatterplot(datKME[restModule, column], GS.Early[restModule], xlab = paste("Module Membership",
module, "module"), ylab = "GS.stage_Early", main = paste("kME.", module,
"vs. GS"), col = module)
}
earlygs <- early.gene.signf.corr %>%
as.data.frame() %>%
tibble::rownames_to_column("gene_id")
earlygsid <- dplyr::right_join(pc.annotations, earlygs, by = 'gene_id')
datKme.id <- as.data.frame(datKME) %>%
rownames_to_column("gene_id")
earlyeiginengene <- dplyr::right_join(earlygsid, datKme.id, by = 'gene_id')
# Define a gene significance variable for Early
GS.mid <- as.numeric(cor(norm.counts, traits$data.Middle.vs.all, use = "p"))
# This translates the numeric values into colors
GS.stage_midColor = numbers2colors(GS.mid, signed = T)
blocknumber = 1
moduleLabelsAutomatic = bwnet$colors
# Convert labels to colors for plotting
moduleColorsAutomatic = labels2colors(moduleLabelsAutomatic)
datColors = data.frame(bwnet$colors, GS.stage_midColor)[bwnet$blockGenes[[blocknumber]],
]
# Plot the dendrogram and the module colors underneath
plotDendroAndColors(bwnet$dendrograms[[blocknumber]], colors = datColors, groupLabels = c("Module colors",
"GS.mid"), dendroLabels = FALSE, hang = 0.03, addGuide = TRUE, guideHang = 0.05)
datKME <- signedKME(norm.counts, module_eigengenes)
# Measure the correlation of the gene's module membership and the associated p-values
module.membership.measure <- cor(module_eigengenes, norm.counts, use = 'p')
module.membership.measure.pvals <- corPvalueStudent(module.membership.measure, nSamples)
# Calculate the gene significance and associated p-values
mid.gene.signf.corr <- cor(norm.counts, traits$data.Middle.vs.all, use = 'p')
mid.gene.signf.corr.pvals <- corPvalueStudent(mid.gene.signf.corr, nSamples)
colorOfColumn = substring(names(datKME), 4)
#selectModules = c("red", "tan", "salmon","green", "magenta")
selectModules = colorOfColumn[colorOfColumn !="grey"]
#par(mfrow = c(3, length(selectModules)/3))
for (module in selectModules) {
column = match(module, colorOfColumn)
restModule = moduleColorsAutomatic == module
verboseScatterplot(datKME[restModule, column], GS.mid[restModule], xlab = paste("Module Membership",
module, "module"), ylab = "GS.stage_mid", main = paste("kME.", module,
"vs. GS"), col = module)
}
midgs <- mid.gene.signf.corr %>%
as.data.frame() %>%
tibble::rownames_to_column("gene_id")
midgsid <- dplyr::right_join(Pcgenenames, midgs, by = 'gene_id')
datKme.id <- as.data.frame(datKME) %>%
rownames_to_column("gene_id")
mideiginengene <- dplyr::right_join(midgsid, datKme.id, by = 'gene_id')
turquoise module (Horvath cares not for the reported cor=). Instead, Look at the y-axis: genes that have high positive module membership tend to be highly positively correlated with the late interaction of Pc in lupin, whereas, genes with negative values in the module they have negative relations with the late interaction of Pc in lupin. Can select either; genes with high positive/negative GS or high kMe (hub genes)
# Define a gene significance variable for Early
GS.late <- as.numeric(cor(norm.counts, traits$data.Late.vs.all, use = "p"))
# This translates the numeric values into colors
GS.stage_lateColor = numbers2colors(GS.late, signed = T)
blocknumber = 1
moduleLabelsAutomatic = bwnet$colors
# Convert labels to colors for plotting
moduleColorsAutomatic = labels2colors(moduleLabelsAutomatic)
datColors = data.frame(bwnet$colors, GS.stage_lateColor)[bwnet$blockGenes[[blocknumber]],
]
# Plot the dendrogram and the module colors underneath
plotDendroAndColors(bwnet$dendrograms[[blocknumber]], colors = datColors, groupLabels = c("Module colors",
"GS.stage.Late"), dendroLabels = FALSE, hang = 0.03, addGuide = TRUE, guideHang = 0.05)
datKME <- signedKME(norm.counts, module_eigengenes)
# Measure the correlation of the gene's module membership and the associated p-values
module.membership.measure <- cor(module_eigengenes, norm.counts, use = 'p')
module.membership.measure.pvals <- corPvalueStudent(module.membership.measure, nSamples)
# Calculate the gene significance and associated p-values
late.gene.signf.corr <- cor(norm.counts, traits$data.Late.vs.all, use = 'p')
late.gene.signf.corr.pvals <- corPvalueStudent(late.gene.signf.corr, nSamples)
colorOfColumn = substring(names(datKME), 4)
#selectModules = c("green", "turquoise", "blue", "grey60", "salmon")
selectModules = colorOfColumn[colorOfColumn !="grey"]
#par(mfrow = c(3, length(selectModules)/3))
for (module in selectModules) {
column = match(module, colorOfColumn)
restModule = moduleColorsAutomatic == module
verboseScatterplot(datKME[restModule, column], GS.late[restModule], xlab = paste("Module Membership",
module, "module"), ylab = "GS.stage_Late", main = paste("kME.", module,
"vs. GS"), col = module)
}
lategs <- late.gene.signf.corr.pvals %>%
as.data.frame() %>%
tibble::rownames_to_column("gene_id")
lategsid <- dplyr::right_join(Pcgenenames, lategs, by = 'gene_id')
datKme.id <- as.data.frame(datKME) %>%
rownames_to_column("gene_id")
lateeiginengene <- dplyr::right_join(lategsid, datKme.id, by = 'gene_id')
library("clusterProfiler")
library("kableExtra")
writeGMT <- function (object, fname ){
if (class(object) != "list") stop("object should be of class 'list'")
if(file.exists(fname)) unlink(fname)
for (iElement in 1:length(object)){
write.table(t(c(make.names(rep(names(object)[iElement],2)),object[[iElement]])),
sep="\t",quote=FALSE,
file=fname,append=TRUE,col.names=FALSE,row.names=FALSE)
}
}
writeGMT(object=gul,fname="goterms.gmt")
genesets <- read.gmt("goterms.gmt")
bg <- gene.list
# chooseTopHubInEachModule returns the gene in each module with the highest connectivity, looking at all genes in the expression file
PcHubgenes <- chooseTopHubInEachModule(norm.counts,
colorh = module.gene.mapping,
omitColor = "grey",
power = 2,
type = "signed")
PcHubgenes
black blue brown cyan green greenyellow lightcyan magenta midnightblue
"IUM83_03164" "IUM83_12926" "IUM83_06000" "IUM83_08664" "IUM83_06699" "IUM83_02874" "IUM83_10668" "IUM83_04177" "IUM83_09489"
pink purple red salmon tan turquoise yellow
"IUM83_14330" "IUM83_02522" "IUM83_07133" "IUM83_02405" "IUM83_19502" "IUM83_15977" "IUM83_18642"
PcHubgenes.df <- PcHubgenes %>%
as.data.frame(row.names = NULL, stringAsFactors = FALSE)
PcHubgenes.df <- PcHubgenes.df %>%
rename("gene_id" = ".")
PcHubgenes.df <- tibble::rownames_to_column(PcHubgenes.df, "module")
PcHubgenes.df <- dplyr::right_join(pc.annotations, PcHubgenes.df, by = 'gene_id')
PcHubgenes.df
# Heatmap of old module eigen-genes and samples
#pdf(file="oldMEs.pdf",heigh=80,width=20)
library("pheatmap")
library(RColorBrewer)
MEs <- bwnet$MEs
rownames(MEs)=names(norm.counts[,9])
pheatmap(MEs,cluster_col=T,cluster_row=T,show_rownames=F,show_colnames=T,fontsize=6)
# Heatmap of new module eigen-genes and sample trait (e.g. Zone)
col_ann <- colData[,c(1,4)]
rownames(col_ann) <- col_ann$Sample
col_ann <- data.frame(col_ann)
col_ann$Stage <- as.factor(col_ann$Stage)
col_ann <- col_ann[order(col_ann$Stage),]
col_ann$sample_ID <- NULL
head(col_ann, 9)
ann_color <- list("col_ann" = c("In Planta" = "purple",
"Early" = "yellow",
"Middle" = "green",
"Late" = "blue"))
data.me <- data.frame(MEs)
data.me <- data.me[order(match(rownames(data.me), rownames(col_ann))),]
dim(data.me)
[1] 12 17
#pdf(file="newMEs.pdf",heigh=60,width=20)
rownames(MEs)=names(colData[ ,1])
pheatmap(data.me,cluster_col=T,cluster_row=F,show_rownames=F,
show_colnames=T,fontsize=6,
annotation_row = col_ann, annotation_colors = ann_color)
Set analysis.
get_module_genes <- function(eigengene_df) {
module_list <- c("red", "greenyellow", "purple", "lightcyan", "blue", "turquoise",
"green", "brown", "yellow", "pink", "tan", "magenta", "midnightblue",
"salmon", "black", "cyan")
gene_list <- list()
for (module in module_list) {
module_genes <- eigengene_df$gene_id[eigengene_df$V1 > 0.2 & eigengene_df[[paste0("kME", module)]] > 0.7]
gene_list[[module]] <- module_genes
}
return(gene_list)
}
module_genes_list_early <- get_module_genes(earlyeiginengene)
module_genes_list_mid <- get_module_genes(mideiginengene)
module_genes_list_late <- get_module_genes(lateeiginengene)
#module_list <- c("red", "greenyellow", "purple", "lightcyan", "blue", "turquoise",
# "green", "brown", "yellow", "pink", "tan", "magenta", "midnightblue",
# "salmon", "black", "cyan")
#gene_lists <- list(module_genes_list_early$blue, module_genes_list_mid$blue, module_genes_list_late$blue)
#results <- list()
for (i in seq_along(gene_lists)) {
module.color <- gene_lists[[i]]
ora_res <- as.data.frame(enricher(gene = module.color,
universe = bg,
maxGSSize = 5000,
TERM2GENE = genesets,
pAdjustMethod = "fdr",
pvalueCutoff = 1,
qvalueCutoff = 1))
ora_res$geneID <- NULL
ora_res <- subset(ora_res, (ora_res$p.adjust < 0.05 & ora_res$Count >= 5))
ora_res_names <- rownames(ora_res)
ora_res$GeneRatio <- as.character(ora_res$GeneRatio)
gr <- as.numeric(sapply(strsplit(as.character(ora_res$GeneRatio), "/"), "[[", 1)) /
as.numeric(sapply(strsplit(as.character(ora_res$GeneRatio), "/"), "[[", 2))
br <- as.numeric(sapply(strsplit(as.character(ora_res$BgRatio), "/"), "[[", 1)) /
as.numeric(sapply(strsplit(as.character(ora_res$BgRatio), "/"), "[[", 2))
ora_res$es <- gr/br
ora_res <- ora_res[order(-ora_res$es), ]
ora_res$Description <- NULL
ora_res$Time <- c("early", "mid", "late")[i]
results[[i]] <- ora_res
}
View(ora_res)